Range from 0 to 100
Linear model is poor representation of data due to generating predictions above 1 and below 0…
Model predictions don’t align with observed data
Heteroscedasticity
Non-linearity
Outcome of multiple binomial trials
Heteroscedasticity
#define range
p = seq(0,1, length=100)
#plot several Beta distributions
plot(p, dbeta(p, 2, 10), ylab='density', type ='l', col='purple')
lines(p, dbeta(p, 2, 2), col='red')
lines(p, dbeta(p, 5, 2), col='blue')
lines(p, dbeta(p, 2, 3), col='pink')
#add legend
legend(.7, 4, c('Beta(2, 10)','Beta(2, 2)','Beta(1,1)','Beta(2, 3)'),
lty=c(1,1,1),col=c('purple', 'red', 'blue','pink'))In beta regression, regressors can have effect on both mean and precision of distribution.
… But, beta distributions are contained between 0s and 1s but do not contain 0s and 1s. So we need to introduce a bernoulli distribution for 0s and 1s
Distinguishes between those who respond 1 and not 1, those who respond 0 and not 0 (logistic), and those who respond between 0 and 1 (beta distribution).
rzoib <- function(n = 1e4, alpha=.1, gamma=.45, mu=.4, phi=3) {
a <- mu * phi
b <- (1 - mu) * phi
y <- vector("numeric", n)
y <- ifelse(
rbinom(n, 1, alpha),
rbinom(n, 1, gamma),
rbeta(n, a, b)
)
y
}
set.seed(10)
groupA <- rzoib(100, a=.11, g=.9, m=.5, p=9)
groupB <- rzoib(100, a=.11, g=.1, m=.5, p=9)set.seed(10)
FPdf <- FPrate(n=100,a=.10,g=.5,m=.5,p=5,iter=10000)
paste0("False Positive Rate is: ", mean(FPdf$FP))[1] "False Positive Rate is: 0.0517"
set.seed(10)
FPdf <- FPrate(100, a=.10, g=.5, m=.2, p=6, iter=10000)
paste0("False Positive Rate is: ", mean(FPdf$FP))[1] "False Positive Rate is: 0.0459"
set.seed(10)
FPdf <- FPrate(100, a=.60, g=.80, m=.65, p=.8, iter=10000)
paste0("False Positive Rate is: ", mean(FPdf$FP))[1] "False Positive Rate is: 0.0523"
set.seed(10)
FPdf <- FPrate(100, a=.50, g=.80, m=.5, p=6, iter=10000)
paste0("False Positive Rate is: ", mean(FPdf$FP))[1] "False Positive Rate is: 0.0497"
powerEst <- function(n1,a1,g1,m1,p1,n2=NULL,a2=NULL,g2=NULL,m2=NULL,p2=NULL,iter){
mat <- matrix(nrow=iter,ncol=6)
for(i in 1:iter){
if(is.null(n2)){
n2=n1
}
if(is.null(a2)){
a2=a1
}
if(is.null(g2)){
g2=g1
}
if(is.null(m2)){
m2=m1
}
if(is.null(p2)){
p2=p1
}
groupA <- rzoib(n=n1, a=a1, g=g1, m=m1, p=p1)
groupB <- rzoib(n=n2, a=a2, g=g2, m=m2, p=p2)
d<- lsr::cohensD(groupA,groupB)
out<-t.test(groupA,groupB)
m <- mean(c(mean(groupA) - mean(groupB)))
s <- mean(c(sd(groupA),sd(groupB)))
pt <- power.t.test(n=round(mean(c(n1,n2))),delta=m,sd=s)
mat[i,] <- c(out$p.value,
out$p.value>.05,
m,
s,
d,
pt$power
)
}
df <- as.data.frame(mat)
colnames(df) <- c("p","Miss","meanDiff","sd","d","empiricalPower")
return(df)
}Start at .5 and increase distance between group A and group B beta distributions means by .005 starting at .005. Only 10% of data is zero-one inflated.
meanDifferences <- seq(from=.005,to=.25,by=.005)
library(parallel)
cl <- makeCluster(detectCores())
powDf <- mclapply(meanDifferences,
function(x)
powerEst(n1=100, a1=.1, g1=.5, m1=(.50 + x/2), p1=10, m2=(.50 - x/2),iter=10000)
)
powerCurveDf1 <- data.frame(d=unlist(lapply(powDf, function(x) mean(x[["d"]]) )),
pow=unlist(lapply(powDf, function(x) 1-mean(x[["Miss"]]) )),
type="Simulated"
)
powerCurveDf2 <- data.frame(d=unlist(lapply(powDf, function(x) mean(x[["d"]]) )),
pow=unlist(lapply(powDf, function(x) mean(x[["empiricalPower"]]) )),
type="Empirical"
)
powerCurveDf <- rbind(powerCurveDf1,powerCurveDf2)Underpowered/less sensitive at lower mean differences, but as mean differences/effect size increases, normality assumption of t-test may matter less.
Proportion of data that is either one or zero inflated set at 50% and proportion group A increasing one-inflated and group B increasingly zero-inflated. Small difference between beta distributions
inflation <- seq(from=.1,to=.75,by=.025)
cl <- makeCluster(detectCores())
powDf <- mclapply(inflation,
function(x)
powerEst
(n1=100, a1=.5, g1=x, m1=(.51), p1=10, m2=(.49),g2=1-x,iter=10000)
)
powerCurveDf1 <- data.frame(d=unlist(lapply(powDf, function(x) mean(x[["d"]]) )),
pow=unlist(lapply(powDf, function(x) 1-mean(x[["Miss"]]) )),
type="Simulated"
)
powerCurveDf2 <- data.frame(d=unlist(lapply(powDf, function(x) mean(x[["d"]]) )),
pow=unlist(lapply(powDf, function(x) mean(x[["empiricalPower"]]) )),
type="Empirical"
)
powerCurveDf <- rbind(powerCurveDf1,powerCurveDf2)Same deal
inflation <- seq(from=.1,to=.75,by=.025)
cl <- makeCluster(detectCores())
powDf <- mclapply(inflation,
function(x)
powerEst(n1=100, a1=.5, g1=x, m1=.755, p1=10, m2=.765,g2=.1,iter=10000)
)
powerCurveDf1 <- data.frame(d=unlist(lapply(powDf, function(x) mean(x[["d"]]) )),
pow=unlist(lapply(powDf, function(x) 1-mean(x[["Miss"]]) )),
type="Simulated"
)
powerCurveDf2 <- data.frame(d=unlist(lapply(powDf, function(x) mean(x[["d"]]) )),
pow=unlist(lapply(powDf, function(x) mean(x[["empiricalPower"]]) )),
type="Empirical"
)
powerCurveDf <- rbind(powerCurveDf1,powerCurveDf2)set.seed(20)
groupA <- rzoib(n=100,alpha=.15,gamma=.72, mu=.53,phi=4)
groupB <- rzoib(n=100,alpha=.15,gamma=.18,mu=.50,phi=4)
simDf <- data.frame(DV=c(groupA,groupB),group=c(rep("A",100),rep("B",100)))
zoib_model <- bf(
DV ~ group,
phi ~ group,
zoi ~ group,
coi ~ group,
family = zero_one_inflated_beta()
) Family: zero_one_inflated_beta
Links: mu = logit; phi = log; zoi = logit; coi = logit
Formula: DV ~ group
phi ~ group
zoi ~ group
coi ~ group
Data: simDf (Number of observations: 200)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -0.11 0.10 -0.30 0.08 1.00 6038 3173
phi_Intercept 1.27 0.14 1.00 1.53 1.00 6134 2996
zoi_Intercept -1.58 0.26 -2.11 -1.08 1.00 6756 3025
coi_Intercept 0.34 0.50 -0.64 1.34 1.00 7564 3474
groupB 0.10 0.15 -0.19 0.38 1.00 6497 3139
phi_groupB -0.28 0.19 -0.65 0.10 1.00 6692 3090
zoi_groupB -0.33 0.40 -1.11 0.44 1.00 6088 2651
coi_groupB 0.48 0.82 -1.09 2.13 1.00 7146 2677
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
# Following the Sequential Effect eXistence and sIgnificance Testing (SEXIT) framework, we report the median of the posterior distribution and its 95% CI (Highest Density Interval), along the probability of direction (pd), the probability of significance and the probability of being large. The thresholds beyond which the effect is considered as significant (i.e., non-negligible) and large are |0.05| and |0.30|.
- b_Intercept (Median = -0.11, 95% CI [-0.30, 0.08]) has a 87.67% probability of being negative (< 0), 74.28% of being significant (< -0.05), and 2.88% of being large (< -0.30)
- b_phi_Intercept (Median = 1.27, 95% CI [1.00, 1.53]) has a 100.00% probability of being positive (> 0), 100.00% of being significant (> 0.05), and 100.00% of being large (> 0.30)
- b_zoi_Intercept (Median = -1.57, 95% CI [-2.11, -1.08]) has a 100.00% probability of being negative (< 0), 100.00% of being significant (< -0.05), and 100.00% of being large (< -0.30)
- b_coi_Intercept (Median = 0.34, 95% CI [-0.64, 1.34]) has a 75.95% probability of being positive (> 0), 72.10% of being significant (> 0.05), and 53.00% of being large (> 0.30)
- b_groupB (Median = 0.10, 95% CI [-0.19, 0.38]) has a 75.10% probability of being positive (> 0), 63.38% of being significant (> 0.05), and 8.45% of being large (> 0.30)
- b_phi_groupB (Median = -0.28, 95% CI [-0.65, 0.10]) has a 92.67% probability of being negative (< 0), 88.05% of being significant (< -0.05), and 45.45% of being large (< -0.30)
- b_zoi_groupB (Median = -0.32, 95% CI [-1.11, 0.44]) has a 79.53% probability of being negative (< 0), 76.08% of being significant (< -0.05), and 52.58% of being large (< -0.30)
- b_coi_groupB (Median = 0.47, 95% CI [-1.09, 2.13]) has a 72.30% probability of being positive (> 0), 70.23% of being significant (> 0.05), and 58.58% of being large (> 0.30)
Parameter | Median | 95% CI | Direction | Significance (> |0.05|) | Large (> |0.30|)
------------------------------------------------------------------------------------------------
Intercept | -0.11 | [-0.30, 0.08] | 0.88 | 0.74 | 0.03
phi_Intercept | 1.27 | [1.00, 1.53] | 1.00 | 1.00 | 1.00
zoi_Intercept | -1.57 | [-2.11, -1.08] | 1.00 | 1.00 | 1.00
coi_Intercept | 0.34 | [-0.64, 1.34] | 0.76 | 0.72 | 0.53
groupB | 0.10 | [-0.19, 0.38] | 0.75 | 0.63 | 0.08
phi_groupB | -0.28 | [-0.65, 0.10] | 0.93 | 0.88 | 0.45
zoi_groupB | -0.32 | [-1.11, 0.44] | 0.80 | 0.76 | 0.53
coi_groupB | 0.47 | [-1.09, 2.13] | 0.72 | 0.70 | 0.59
Call:
lm(formula = DV ~ group, data = simDf)
Residuals:
Min 1Q Median 3Q Max
-0.54946 -0.21032 -0.01092 0.21922 0.60733
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.54946 0.02731 20.118 < 2e-16 ***
groupB -0.15679 0.03862 -4.059 7.08e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2731 on 198 degrees of freedom
Multiple R-squared: 0.07683, Adjusted R-squared: 0.07216
F-statistic: 16.48 on 1 and 198 DF, p-value: 7.084e-05
“The ZOIB has issues with overfitting the data by fitting multiple sets of parameters to degenerate (bounded) and continuous responses separately.”
Primary issue is it assumes the processes of 0, 1, and continuous beta are completely independent processes.
ordered cut points, similar in spirit to an ordered logit model, to estimate the joint probability of 0s (the lower bound), continuous proportions, and 1s (the upper bound) in bounded continuous data.
As only one predictive model is used for all of the outcomes, the effect of covariates is identified across degenerate and continuous observations without resulting in overfitting.
The use of cut points permits the model to fit distributions with mostly degenerate observations or no degenerate observations at all, which makes it a general solution to this problem.
Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG -I"/Users/jacobelder/Library/R/x86_64/4.2/library/Rcpp/include/" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/unsupported" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/BH_1.78.0-0/BH/include" -I"/Users/jacobelder/Library/R/x86_64/4.2/library/StanHeaders/include/src/" -I"/Users/jacobelder/Library/R/x86_64/4.2/library/StanHeaders/include/" -I"/Users/jacobelder/Library/R/x86_64/4.2/library/RcppParallel/include/" -I"/Users/jacobelder/Library/R/x86_64/4.2/library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/Users/jacobelder/Library/R/x86_64/4.2/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fPIC -Wall -g -O2 -c foo.c -o foo.o
In file included from <built-in>:1:
In file included from /Users/jacobelder/Library/R/x86_64/4.2/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
In file included from /Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/Dense:1:
In file included from /Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/Core:88:
/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name 'namespace'
namespace Eigen {
^
/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:16: error: expected ';' after top level declarator
namespace Eigen {
^
;
In file included from <built-in>:1:
In file included from /Users/jacobelder/Library/R/x86_64/4.2/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
In file included from /Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/Dense:1:
/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
#include <complex>
^~~~~~~~~
3 errors generated.
make: *** [foo.o] Error 1
Family: ord_beta_reg
Links: mu = identity; phi = identity; cutzero = identity; cutone = identity
Formula: DV ~ 0 + Intercept + +group
Data: data (Number of observations: 200)
Draws: 2 chains, each with iter = 1000; warmup = 500; thin = 1;
total post-warmup draws = 1000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 0.13 0.09 -0.06 0.31 1.00 784 395
groupB -0.43 0.13 -0.68 -0.18 1.00 702 711
Family Specific Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
phi 4.19 0.38 3.48 4.98 1.00 922 717
cutzero -2.45 0.27 -2.98 -1.95 1.00 900 764
cutone 1.61 0.07 1.46 1.76 1.00 798 426
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Call:
lm(formula = outTherm ~ CSEid, data = indDiffs1)
Residuals:
Min 1Q Median 3Q Max
-0.32673 -0.16784 -0.06314 0.11216 0.80937
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.36562 0.03427 10.668 < 2e-16 ***
CSEid -0.03889 0.00816 -4.766 2.82e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2255 on 331 degrees of freedom
(51 observations deleted due to missingness)
Multiple R-squared: 0.06421, Adjusted R-squared: 0.06138
F-statistic: 22.71 on 1 and 331 DF, p-value: 2.823e-06
zoib_model <- bf(
outTherm ~ CSEid,
phi ~ CSEid,
zoi ~ CSEid,
coi ~ CSEid,
family = zero_one_inflated_beta()
)
ncores=detectCores()
zoib.m <- brm(
formula = zoib_model,
data = indDiffs1,
silent=2, refresh=0, cores=ncores, refresh=0
)Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/Rcpp_1.0.9/Rcpp/include/" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/unsupported" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/BH_1.78.0-0/BH/include" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/StanHeaders_2.21.0-7/StanHeaders/include/src/" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/StanHeaders_2.21.0-7/StanHeaders/include/" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppParallel_5.1.5/RcppParallel/include/" -I"/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/rstan_2.21.5/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/StanHeaders_2.21.0-7/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fPIC -Wall -g -O2 -c foo.c -o foo.o
In file included from <built-in>:1:
In file included from /Users/jacobelder/R_groundhog/groundhog_library/R-4.2/StanHeaders_2.21.0-7/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
In file included from /Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/Dense:1:
In file included from /Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/Core:88:
/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name 'namespace'
namespace Eigen {
^
/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:16: error: expected ';' after top level declarator
namespace Eigen {
^
;
In file included from <built-in>:1:
In file included from /Users/jacobelder/R_groundhog/groundhog_library/R-4.2/StanHeaders_2.21.0-7/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
In file included from /Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/Dense:1:
/Users/jacobelder/R_groundhog/groundhog_library/R-4.2/RcppEigen_0.3.3.9.2/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
#include <complex>
^~~~~~~~~
3 errors generated.
make: *** [foo.o] Error 1
Family: zero_one_inflated_beta
Links: mu = logit; phi = log; zoi = logit; coi = logit
Formula: outTherm ~ CSEid
phi ~ CSEid
zoi ~ CSEid
coi ~ CSEid
Data: indDiffs1 (Number of observations: 333)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -0.32 0.18 -0.66 0.03 1.00 5862 3323
phi_Intercept 1.37 0.26 0.87 1.85 1.00 5479 3430
zoi_Intercept -2.35 0.40 -3.16 -1.61 1.00 4852 2935
coi_Intercept -3.37 1.80 -7.32 -0.28 1.00 4741 2655
CSEid -0.16 0.05 -0.25 -0.07 1.00 4919 3177
phi_CSEid -0.03 0.07 -0.15 0.10 1.00 4991 3551
zoi_CSEid 0.31 0.09 0.14 0.48 1.00 5381 3180
coi_CSEid -0.07 0.40 -0.84 0.72 1.00 4638 3056
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Family: ord_beta_reg
Links: mu = identity; phi = identity; cutzero = identity; cutone = identity
Formula: outTherm ~ 0 + Intercept + +CSEid
Data: data (Number of observations: 333)
Draws: 2 chains, each with iter = 1000; warmup = 500; thin = 1;
total post-warmup draws = 1000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -0.22 0.14 -0.51 0.06 1.01 612 557
CSEid -0.19 0.04 -0.26 -0.12 1.01 617 577
Family Specific Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
phi 3.64 0.29 3.07 4.23 1.00 911 712
cutzero -2.10 0.14 -2.37 -1.84 1.00 904 711
cutone 1.80 0.10 1.62 2.00 1.00 906 588
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).